In [1]:
#IMPORT THE LIBRARIES
import pandas as pd
import numpy as np
import os
import sys
# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
# to play the audio files
import IPython.display as ipd
from IPython.display import Audio
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM,BatchNormalization , GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import SGD
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
In [2]:
# Root directory of the RAVDESS speech corpus (Kaggle dataset layout).
# Note the trailing slash: every later cell concatenates with plain '+'.
RAVD = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
In [3]:
# Walk every Actor_* directory and build one row per .wav file with a
# combined gender_emotion label, the corpus name, and the file path.
# RAVDESS filename fields: modality-vocal-emotion-intensity-statement-repetition-actor.
actor_dirs = sorted(os.listdir(RAVD))
emotion, gender, path = [], [], []
for actor_dir in actor_dirs:
    for wav_name in os.listdir(RAVD + actor_dir):
        fields = wav_name.split('.')[0].split('-')
        emotion.append(int(fields[2]))          # 3rd field = emotion code
        # 7th field is the actor id: even ids are female, odd ids are male
        gender.append("female" if int(fields[6]) % 2 == 0 else "male")
        path.append(RAVD + actor_dir + '/' + wav_name)
# Map numeric emotion codes to names (code 2 "calm" is folded into neutral).
RAVD_df = pd.DataFrame(emotion).replace(
    {1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad',
     5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'})
RAVD_df = pd.concat([pd.DataFrame(gender), RAVD_df], axis=1)
RAVD_df.columns = ['gender', 'emotion']
RAVD_df['labels'] = RAVD_df.gender + '_' + RAVD_df.emotion
RAVD_df['source'] = 'RAVDESS'
RAVD_df = pd.concat([RAVD_df, pd.DataFrame(path, columns=['path'])], axis=1)
RAVD_df = RAVD_df.drop(['gender', 'emotion'], axis=1)
RAVD_df.labels.value_counts()
Out[3]:
male_neutral 144 female_neutral 144 male_sad 96 male_fear 96 male_happy 96 male_disgust 96 male_angry 96 male_surprise 96 female_surprise 96 female_disgust 96 female_fear 96 female_sad 96 female_happy 96 female_angry 96 Name: labels, dtype: int64
In [4]:
# Bar chart of how many clips exist per gender_emotion label.
plt.figure(figsize=(12, 5))
plt.title('Count of Emotions', size=16)
# FIX: seaborn >= 0.12 removed positional data support for countplot;
# the data must be passed with the x= keyword.
sns.countplot(x=RAVD_df.labels)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
plt.xticks(rotation=45)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
In [5]:
# Female Happy
# Female Happy (emotion code 03, Actor 08 — even id = female).
fRA1= RAVD + 'Actor_08/03-01-03-02-02-01-08.wav'
data, sr = librosa.load(fRA1)
# Embed an audio player for the clip (last expression of the cell).
ipd.Audio(fRA1)
Out[5]:
In [6]:
# CREATE LOG MEL SPECTROGRAM
# Log-mel spectrogram of the "female happy" clip loaded above.
plt.figure(figsize=(10, 5))
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
mel_db = librosa.power_to_db(mel_power)
librosa.display.specshow(mel_db, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Female Happy')
plt.colorbar(format='%+2.0f dB')
Out[6]:
<matplotlib.colorbar.Colorbar at 0x797a7a5f9b50>
In [7]:
#Female Fear
# Female Fear (emotion code 06, Actor 08).
fRA2=RAVD +'Actor_08/03-01-06-01-01-01-08.wav'
data, sr = librosa.load(fRA2)
# Audio player for the clip.
ipd.Audio(fRA2)
Out[7]:
In [8]:
# CREATE LOG MEL SPECTROGRAM
# Log-mel spectrogram of the "female fear" clip loaded above.
plt.figure(figsize=(10, 5))
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
mel_db = librosa.power_to_db(mel_power)
librosa.display.specshow(mel_db, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Female Fear')
plt.colorbar(format='%+2.0f dB');
Next, we look at mel spectrograms for two more clips and then compare MFCC features of the same emotion across genders.
In [9]:
# Female Disgust
fRA1 =RAVD +'Actor_20/03-01-08-02-02-02-20.wav'
data, sr = librosa.load(fRA1)
ipd.Audio(fRA1)
Out[9]:
In [10]:
# CREATE LOG MEL SPECTROGRAM
plt.figure(figsize=(10, 5))
spectrogram = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128,fmax=8000)
spectrogram = librosa.power_to_db(spectrogram)
librosa.display.specshow(spectrogram, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Female Disgust')
plt.colorbar(format='%+2.0f dB');
In [11]:
# Male Sad — emotion field 04 maps to 'sad' in the RAVD_df mapping above
# (the original comment said "Male Fearfull"; fear is code 06).
fRA1 = RAVD + 'Actor_19/03-01-04-01-02-01-19.wav'
data, sr = librosa.load(fRA1)
# Audio player for the clip.
ipd.Audio(fRA1)
Out[11]:
In [12]:
# CREATE LOG MEL SPECTROGRAM
plt.figure(figsize=(10, 5))
spectrogram = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128,fmax=8000)
spectrogram = librosa.power_to_db(spectrogram)
librosa.display.specshow(spectrogram, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Male Fearfull')
plt.colorbar(format='%+2.0f dB');
In [13]:
# Gender - Female; Emotion - Angry (code 05, Actor 18)
# NOTE: this cell uses the relative "../input" path while earlier cells use
# the absolute /kaggle/input path; both resolve to the same dataset on Kaggle.
path = "../input/ravdess-emotional-speech-audio/Actor_18/03-01-05-01-01-01-18.wav"
# sr=22050*2 resamples to 44.1 kHz; kaiser_fast trades quality for speed.
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# MFCC heatmap (first panel of a 3-row grid; the other rows are unused).
plt.figure(figsize=(16, 10))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()
ipd.Audio(path)
Out[13]:
In [14]:
# Gender - Male; Emotion - Angry (code 05, Actor 17 — odd id = male)
path = "../input/ravdess-emotional-speech-audio/Actor_17/03-01-05-01-01-02-17.wav"
# Same loading parameters as the female clip above for a fair comparison.
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
# MFCC heatmap
plt.figure(figsize=(16, 10))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()
ipd.Audio(path)
Out[14]:
In [15]:
# Compare per-frame mean-MFCC traces of the same emotion across genders.
# Gender - Female; Emotion - angry
path = "../input/ravdess-emotional-speech-audio/Actor_18/03-01-05-01-01-01-18.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
# FIX: the original computed the full MFCC matrix and then immediately
# recomputed it inside np.mean, discarding the first result. Compute once;
# averaging over axis=0 collapses the 13 coefficients to one value per frame.
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))
# Gender - Male; Emotion - angry
path = "../input/ravdess-emotional-speech-audio/Actor_17/03-01-05-01-01-02-17.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))
# Plot the two traces together
plt.figure(figsize=(16,10))
plt.subplot(3,1,1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
216 216
Out[15]:
<matplotlib.legend.Legend at 0x797a790ea490>
In [16]:
# Same cross-gender comparison for the "surprised" emotion (code 08).
# Gender - Female; Emotion - Surprised
path = "../input/ravdess-emotional-speech-audio/Actor_20/03-01-08-02-01-02-20.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
# FIX: the original computed the MFCC matrix twice (the first result was
# discarded); compute it once and take the per-frame mean directly.
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))
# Gender - Male; Emotion - Surprised
path = "../input/ravdess-emotional-speech-audio/Actor_21/03-01-08-02-01-01-21.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))
# Plot the two traces together
plt.figure(figsize=(16,10))
plt.subplot(3,1,1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
216 216
Out[16]:
<matplotlib.legend.Legend at 0x797a790bead0>
In [17]:
# DATA AUGMENTATION HELPERS
def noise(data):
    """Add white Gaussian noise scaled to at most ~3.5% of the signal peak."""
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    """Time-stretch the signal (rate < 1 slows it down, lengthening it)."""
    # FIX: librosa >= 0.10 made `rate` keyword-only; the old positional call
    # `time_stretch(data, rate)` raises a TypeError.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly shift the waveform by a random offset in [-5000, 5000) samples."""
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Shift the pitch by `pitch_factor` semitones without changing duration."""
    # FIX: librosa >= 0.10 requires keyword arguments (sr=, n_steps=) here.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)
In [18]:
# Load one sample file (row 471 of RAVD_df) to demonstrate the augmentations.
# NOTE: `data` and `sample_rate` set here are module-level globals that some
# later cells read implicitly.
path = np.array(RAVD_df['path'])[471]
data, sample_rate = librosa.load(path)
In [19]:
# NORMAL AUDIO
# Waveform of the unmodified clip, plus an audio player.
import librosa.display  # already imported at the top; harmless re-import
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=data, sr=sample_rate)
Audio(path)
Out[19]:
In [20]:
# Waveform and player for the noise-augmented version of the clip.
noisy_audio = noise(data)
plt.figure(figsize=(12,5))
librosa.display.waveshow(y=noisy_audio, sr=sample_rate)
Audio(noisy_audio, rate=sample_rate)
Out[20]:
In [21]:
# Waveform and player for the time-stretched version of the clip.
stretched_audio = stretch(data)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=stretched_audio, sr=sample_rate)
Audio(stretched_audio, rate=sample_rate)
Out[21]:
In [22]:
# Waveform and player for the circularly shifted version of the clip.
shifted_audio = shift(data)
plt.figure(figsize=(12,5))
librosa.display.waveshow(y=shifted_audio, sr=sample_rate)
Audio(shifted_audio, rate=sample_rate)
Out[22]:
In [23]:
# Waveform and player for the pitch-shifted version of the clip.
pitched_audio = pitch(data, sample_rate)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=pitched_audio, sr=sample_rate)
Audio(pitched_audio, rate=sample_rate)
Out[23]:
In [24]:
def feat_ext(data, sample_rate=22050):
    """
    Mean MFCC vector (librosa's default 20 coefficients) for one clip.

    Parameters
    ----------
    data : np.ndarray
        Audio time series.
    sample_rate : int
        Sampling rate of `data`. FIX: the original read `sample_rate` from the
        notebook's global scope (a leftover from an earlier demo cell), which
        only worked by accident; it is now an explicit parameter whose default
        matches librosa.load's default sr, so old call sites keep working.
    """
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    return mfcc

def get_feat(path):
    """
    Load a clip and return a (3, 20) array of mean-MFCC features for the
    normal, noise-augmented, and stretch+pitch-augmented versions.
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    # normal data
    res1 = feat_ext(data, sample_rate)
    result = np.array(res1)
    # data with noise
    noise_data = noise(data)
    res2 = feat_ext(noise_data, sample_rate)
    result = np.vstack((result, res2))
    # data with stretch and pitch
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = feat_ext(data_stretch_pitch, sample_rate)
    result = np.vstack((result, res3))
    return result
In [25]:
# Sanity check: first rows of the assembled dataframe (labels / source / path).
RAVD_df.head()
Out[25]:
| labels | source | path | |
|---|---|---|---|
| 0 | male_neutral | RAVDESS | /kaggle/input/ravdess-emotional-speech-audio/a... |
| 1 | male_neutral | RAVDESS | /kaggle/input/ravdess-emotional-speech-audio/a... |
| 2 | male_sad | RAVDESS | /kaggle/input/ravdess-emotional-speech-audio/a... |
| 3 | male_neutral | RAVDESS | /kaggle/input/ravdess-emotional-speech-audio/a... |
| 4 | male_neutral | RAVDESS | /kaggle/input/ravdess-emotional-speech-audio/a... |
In [26]:
# Build the full feature matrix: 3 rows per audio file (normal / noise /
# stretch+pitch), each paired with that file's label. This is the slow cell.
X, Y = [], []
for path, emotion in zip(RAVD_df['path'], RAVD_df['labels']):
    for feature_row in get_feat(path):
        X.append(feature_row)
        Y.append(emotion)
In [27]:
# Cache the extracted features to CSV so later runs can skip extraction.
Emotions = pd.DataFrame(X)
Emotions['labels'] = Y
Emotions.to_csv('emotion.csv', index=False)
Emotions.head()
Out[27]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -626.266724 | 93.891739 | -0.696724 | 17.828402 | 9.496767 | 2.025836 | -2.726057 | -8.519138 | -12.432029 | -6.580182 | ... | -2.754473 | 0.774303 | -5.368925 | -0.340401 | 1.479823 | -8.706111 | -2.767464 | -1.620493 | -1.525633 | male_neutral |
| 1 | -424.219659 | 33.409893 | 13.512744 | 9.723939 | 4.621845 | 1.812571 | -1.768389 | -6.672808 | -7.872926 | -5.476621 | ... | -0.298616 | -1.024360 | -1.842204 | -0.319954 | -0.260445 | -4.467974 | -3.670608 | 0.103759 | -2.584844 | male_neutral |
| 2 | -680.248840 | 90.474678 | -2.995482 | 17.773315 | 6.315861 | 0.721663 | -6.446163 | -11.472776 | -14.421964 | -4.905107 | ... | -1.993616 | -0.631891 | -6.631033 | 1.132353 | -2.568039 | -8.887710 | -1.243952 | -2.682266 | -6.088979 | male_neutral |
| 3 | -634.959839 | 72.811478 | -3.487027 | 20.697269 | 10.188320 | -0.667840 | -3.293633 | -7.447816 | -16.703850 | -2.161060 | ... | -3.714514 | 0.273592 | -4.517450 | -1.117245 | 0.534381 | -6.885534 | -1.295200 | -3.289555 | 0.756877 | male_neutral |
| 4 | -453.426387 | 31.440529 | 8.427989 | 11.046314 | 5.784891 | -1.362626 | -2.919144 | -6.799702 | -9.867522 | -5.229569 | ... | -1.979740 | -1.493574 | -2.101019 | -1.031402 | -0.849473 | -4.312014 | -3.388788 | -0.723249 | -1.842601 | male_neutral |
5 rows × 21 columns
In [28]:
# can use this directly from saved feature .csv file
# Reload features from the cached CSV — a fresh run can start from here
# (after the function-definition cells) instead of re-extracting.
Emotions = pd.read_csv('./emotion.csv')
Emotions.head()
Out[28]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -626.266724 | 93.891739 | -0.696724 | 17.828402 | 9.496767 | 2.025836 | -2.726057 | -8.519138 | -12.432029 | -6.580182 | ... | -2.754473 | 0.774303 | -5.368925 | -0.340401 | 1.479823 | -8.706111 | -2.767464 | -1.620493 | -1.525633 | male_neutral |
| 1 | -424.219659 | 33.409893 | 13.512744 | 9.723939 | 4.621845 | 1.812571 | -1.768389 | -6.672808 | -7.872926 | -5.476621 | ... | -0.298616 | -1.024360 | -1.842204 | -0.319954 | -0.260445 | -4.467974 | -3.670608 | 0.103759 | -2.584844 | male_neutral |
| 2 | -680.248840 | 90.474678 | -2.995482 | 17.773315 | 6.315861 | 0.721663 | -6.446163 | -11.472776 | -14.421964 | -4.905107 | ... | -1.993616 | -0.631891 | -6.631033 | 1.132353 | -2.568039 | -8.887710 | -1.243952 | -2.682266 | -6.088979 | male_neutral |
| 3 | -634.959839 | 72.811478 | -3.487027 | 20.697269 | 10.188320 | -0.667840 | -3.293633 | -7.447816 | -16.703850 | -2.161060 | ... | -3.714514 | 0.273592 | -4.517450 | -1.117245 | 0.534381 | -6.885534 | -1.295200 | -3.289555 | 0.756877 | male_neutral |
| 4 | -453.426387 | 31.440529 | 8.427989 | 11.046314 | 5.784891 | -1.362626 | -2.919144 | -6.799702 | -9.867522 | -5.229569 | ... | -1.979740 | -1.493574 | -2.101019 | -1.031402 | -0.849473 | -4.312014 | -3.388788 | -0.723249 | -1.842601 | male_neutral |
5 rows × 21 columns
In [29]:
# Feature matrix (all columns but the last) and label vector.
X = Emotions.iloc[: ,:-1].values
Y = Emotions['labels'].values
In [30]:
# As this is a multiclass classification problem onehotencoding our Y
# Multiclass problem: one-hot encode the 14 gender_emotion labels.
# OneHotEncoder orders its categories by sorted label name.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()
In [31]:
# Train and Test Split
# Train/test split — default test_size (0.25) gives the 3240/1080 split below.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
Out[31]:
((3240, 20), (3240, 14), (1080, 20), (1080, 14))
In [32]:
# Reshape for LSTM
# NOTE(review): these arrays are built from the *unscaled* features and are
# never referenced again (the training loop below builds its own reshaped
# copies from scaled data), so these two lines are effectively dead code.
X_train = x_train.reshape(x_train.shape[0] , x_train.shape[1] , 1)
X_test = x_test.reshape(x_test.shape[0] , x_test.shape[1] , 1)
In [33]:
# scaling our data with sklearn's Standard scaler
# Standardize features; fit on the training set only to avoid test leakage.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
Out[33]:
((3240, 20), (3240, 14), (1080, 20), (1080, 14))
In [42]:
# Additional imports for advanced feature extraction
import scipy
from scipy.fftpack import dct
import scipy.signal as signal
from scipy.fftpack import dct
def extract_plp(audio, sr, n_coeff=13):
    """
    Extract Perceptual Linear Prediction-style coefficients.

    NOTE(review): this is a mel-filterbank + log + DCT pipeline, i.e. an
    MFCC-like approximation; true PLP also applies equal-loudness weighting,
    cube-root compression, and LP modelling, which are absent here.

    Parameters
    ----------
    audio : np.ndarray -- mono time series
    sr : int -- sampling rate of `audio`
    n_coeff : int -- number of cepstral coefficients to keep

    Returns
    -------
    np.ndarray of shape (n_coeff,): coefficients averaged over all frames.
    """
    # Pre-emphasis filter boosts high frequencies (y[t] = x[t] - 0.97*x[t-1]).
    pre_emphasis = 0.97
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
    # Frame the signal: 25 ms windows with a 10 ms hop.
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))
    signal_length = len(emphasized_audio)
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))
    # Zero-pad so the last frame is complete.
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_audio, z)
    # Index matrix: row f holds the sample indices of frame f.
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
              np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    # Apply hamming window to each frame.
    frames *= np.hamming(frame_length)
    # Periodogram power spectrum over a 512-point FFT.
    NFFT = 512
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))
    # Triangular mel-scale filter bank (26 filters spanning 0..sr/2).
    nfilt = 26
    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
    hz_points = (700 * (10**(mel_points / 2595) - 1))
    bin_points = np.floor((NFFT + 1) * hz_points / sr)
    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        f_m_minus = int(bin_points[m - 1])   # left edge of triangle m
        f_m = int(bin_points[m])             # peak
        f_m_plus = int(bin_points[m + 1])    # right edge
        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
    filter_banks = np.dot(pow_frames, fbank.T)
    # Replace exact zeros with machine epsilon so log() is finite.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    filter_banks = np.log(filter_banks)
    # DCT-II decorrelates the log filter-bank energies; keep n_coeff terms.
    plp = dct(filter_banks, type=2, axis=1, norm='ortho')[:, :n_coeff]
    # Average over frames to get one fixed-length vector per clip.
    return np.mean(plp, axis=0)
In [43]:
def extract_lpcc(audio, sr, n_coeff=13):
    """
    Extract Linear Prediction Cepstral Coefficients.

    LPCC captures the spectral envelope of the speech signal: per frame,
    LPC coefficients are estimated via Levinson-Durbin recursion on the
    autocorrelation, then converted to cepstral coefficients.

    Returns the frame-averaged (n_coeff,) vector, or zeros if no frame
    passed the validity check.
    """
    # Pre-emphasis (high-frequency boost).
    pre_emphasis = 0.97
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
    # Framing: 25 ms windows, 10 ms hop, zero-padded last frame.
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))
    signal_length = len(emphasized_audio)
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_audio, z)
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
              np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    # Apply hamming window.
    frames *= np.hamming(frame_length)
    # LPC analysis, one coefficient set per frame.
    lpc_order = n_coeff
    lpcc_features = []
    for frame in frames:
        # Autocorrelation (keep non-negative lags only).
        autocorr = np.correlate(frame, frame, mode='full')
        autocorr = autocorr[len(autocorr)//2:]
        # Skip frames that are too short or have zero energy (division guard);
        # the zeros() fallback at the end handles the all-skipped case.
        if len(autocorr) > lpc_order and autocorr[0] != 0:
            # Levinson-Durbin recursion for LPC coefficients.
            lpc_coeffs = np.zeros(lpc_order + 1)
            lpc_coeffs[0] = 1.0
            error = autocorr[0]
            for i in range(1, lpc_order + 1):
                lambda_val = -np.sum(lpc_coeffs[:i] * autocorr[i:0:-1]) / error
                # NOTE(review): the slice update below already writes index i
                # (lpc_coeffs[0]*lambda), which the next line then overwrites
                # with the same value — redundant but harmless. Verify against
                # the textbook Levinson-Durbin formulation before refactoring.
                lpc_coeffs[1:i+1] += lambda_val * lpc_coeffs[i-1::-1]
                lpc_coeffs[i] = lambda_val
                error *= (1 - lambda_val**2)
            # Convert LPC to LPCC via the standard cepstral recursion.
            lpcc = np.zeros(n_coeff)
            lpcc[0] = -lpc_coeffs[1]
            for n in range(2, n_coeff + 1):
                sum_val = 0
                for k in range(1, n):
                    sum_val += (k / n) * lpcc[k-1] * lpc_coeffs[n-k]
                lpcc[n-1] = -lpc_coeffs[n] - sum_val
            lpcc_features.append(lpcc)
    if len(lpcc_features) > 0:
        return np.mean(lpcc_features, axis=0)
    else:
        # No usable frame (e.g. pure silence): return a zero vector.
        return np.zeros(n_coeff)
In [44]:
def extract_gfcc(audio, sr, n_coeff=13):
    """
    Extract Gammatone Frequency Cepstral Coefficient-style features.

    NOTE(review): despite the name, no gammatone filters are used — this is
    the same mel filter bank as extract_plp, followed by a cube-root
    ("power-law") compression step that loosely mimics cochlear compression.
    Treat it as a GFCC approximation.

    Returns the frame-averaged (n_coeff,) vector.
    """
    # Framing parameters: 25 ms windows, 10 ms hop, 512-point FFT.
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))
    NFFT = 512
    # Pre-emphasis (high-frequency boost).
    pre_emphasis = 0.97
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
    # Framing with zero-padded last frame.
    signal_length = len(emphasized_audio)
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_audio, z)
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
              np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    # Hamming window.
    frames *= np.hamming(frame_length)
    # Periodogram power spectrum.
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))
    # Triangular filter bank on the mel scale (26 filters, 0..sr/2).
    nfilt = 26
    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
    hz_points = (700 * (10**(mel_points / 2595) - 1))
    bin_points = np.floor((NFFT + 1) * hz_points / sr)
    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        f_m_minus = int(bin_points[m - 1])   # left edge
        f_m = int(bin_points[m])             # peak
        f_m_plus = int(bin_points[m + 1])    # right edge
        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
    filter_banks = np.dot(pow_frames, fbank.T)
    # Avoid zeros before the compression / log steps.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    # Cube-root power-law compression (mimics cochlear compression).
    filter_banks = np.power(filter_banks, 0.33)
    # Log with a small offset for numerical safety.
    filter_banks = np.log(filter_banks + 1e-8)
    # DCT-II to decorrelate; keep n_coeff terms and average over frames.
    gfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, :n_coeff]
    return np.mean(gfcc, axis=0)
In [45]:
def extract_all_features(path, feature_type='mfcc'):
    """
    Extract features of the requested type from one audio file, with
    augmentation: returns a (3, 20) array whose rows are the features of
    (1) the original audio, (2) the audio with additive noise, and
    (3) the audio time-stretched then pitch-shifted.

    Parameters
    ----------
    path : str -- audio file path
    feature_type : str -- one of 'mfcc', 'plp', 'lpcc', 'gfcc'

    Raises
    ------
    ValueError
        If feature_type is not a supported type (the original silently fell
        through and crashed later with an undefined variable).
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    def _extract(audio):
        # Single dispatch point; the original repeated this if/elif chain
        # three times, once per augmentation.
        if feature_type == 'mfcc':
            return np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=20).T, axis=0)
        if feature_type == 'plp':
            return extract_plp(audio, sample_rate, n_coeff=20)
        if feature_type == 'lpcc':
            return extract_lpcc(audio, sample_rate, n_coeff=20)
        if feature_type == 'gfcc':
            return extract_gfcc(audio, sample_rate, n_coeff=20)
        raise ValueError(f"Unknown feature_type: {feature_type!r}")

    # normal data
    res1 = _extract(data)
    result = np.array(res1)
    # data with noise
    res2 = _extract(noise(data))
    result = np.vstack((result, res2))
    # data with stretch and pitch
    res3 = _extract(pitch(stretch(data), sample_rate))
    result = np.vstack((result, res3))
    return result
In [46]:
# Extract every feature type over the whole corpus (slow: four full passes).
# feature_sets maps feature name -> {'X': (n, 20) array, 'Y': label array}.
feature_sets = {}
feature_types = ['mfcc', 'plp', 'lpcc', 'gfcc']
for feat_type in feature_types:
    print(f"Extracting {feat_type.upper()} features...")
    X_feat, Y_feat = [], []
    for path, emotion in zip(RAVD_df['path'], RAVD_df['labels']):
        # Each file yields 3 rows: normal, noisy, stretch+pitch.
        for feature_row in extract_all_features(path, feature_type=feat_type):
            X_feat.append(feature_row)
            Y_feat.append(emotion)
    feature_sets[feat_type] = {'X': np.array(X_feat), 'Y': np.array(Y_feat)}
    print(f"{feat_type.upper()}: Shape {feature_sets[feat_type]['X'].shape}")
Extracting MFCC features... MFCC: Shape (4320, 20) Extracting PLP features... PLP: Shape (4320, 20) Extracting LPCC features... LPCC: Shape (4320, 20) Extracting GFCC features... GFCC: Shape (4320, 20)
In [48]:
def build_model(input_shape, num_classes):
    """
    Build a CNN-LSTM hybrid model for emotion classification.

    Parameters
    ----------
    input_shape : tuple -- (timesteps, channels), here (n_features, 1)
    num_classes : int -- number of output classes (softmax width)

    Returns
    -------
    A compiled keras Sequential model (categorical crossentropy, Adam).
    """
    model = Sequential()
    # CNN layers: local feature extraction with pooling + batch norm + dropout.
    model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2, strides=2, padding='same'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2, strides=2, padding='same'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    # LSTM layers: model the pooled feature sequence.
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(64))
    model.add(Dropout(0.3))
    # Dense head with softmax over the emotion classes.
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
In [49]:
# Train one CNN-LSTM per feature type and collect everything needed for the
# comparison / plotting cells below.
results = {}
for feat_type in feature_types:
    print(f"\n{'='*60}")
    print(f"Training model with {feat_type.upper()} features")
    print(f"{'='*60}")
    # Prepare data for this feature type
    X = feature_sets[feat_type]['X']
    Y = feature_sets[feat_type]['Y']
    # One-hot encode labels (a fresh encoder per feature type)
    encoder = OneHotEncoder()
    Y_encoded = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()
    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(X, Y_encoded,
                                                        test_size=0.25,
                                                        random_state=42,
                                                        shuffle=True)
    # Scale data — fit on train only to avoid test-set leakage
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    # Reshape to (samples, timesteps, 1) for the Conv1D/LSTM stack
    x_train_reshaped = x_train_scaled.reshape(x_train_scaled.shape[0], x_train_scaled.shape[1], 1)
    x_test_reshaped = x_test_scaled.reshape(x_test_scaled.shape[0], x_test_scaled.shape[1], 1)
    print(f"Train shape: {x_train_reshaped.shape}, Test shape: {x_test_reshaped.shape}")
    # Build model
    model = build_model((x_train_reshaped.shape[1], 1), y_train.shape[1])
    # Train model. NOTE(review): validation_data is the test set, so the
    # "val" curves below are test-set metrics — fine for comparing features,
    # but not an unbiased estimate if used for model selection.
    history = model.fit(x_train_reshaped, y_train,
                        batch_size=64,
                        epochs=100,
                        validation_data=(x_test_reshaped, y_test),
                        verbose=0)
    # Evaluate on the held-out set
    test_loss, test_accuracy = model.evaluate(x_test_reshaped, y_test, verbose=0)
    # Store results (model + preprocessing objects for later reuse)
    results[feat_type] = {
        'model': model,
        'history': history,
        'test_accuracy': test_accuracy,
        'test_loss': test_loss,
        'x_test': x_test_reshaped,
        'y_test': y_test,
        'scaler': scaler,
        'encoder': encoder
    }
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print(f"Test Loss: {test_loss:.4f}")
============================================================ Training model with MFCC features ============================================================ Train shape: (3240, 20, 1), Test shape: (1080, 20, 1) Test Accuracy: 82.96% Test Loss: 0.7929 ============================================================ Training model with PLP features ============================================================ Train shape: (3240, 20, 1), Test shape: (1080, 20, 1) Test Accuracy: 79.54% Test Loss: 0.8523 ============================================================ Training model with LPCC features ============================================================ Train shape: (3240, 20, 1), Test shape: (1080, 20, 1) Test Accuracy: 68.70% Test Loss: 1.3467 ============================================================ Training model with GFCC features ============================================================ Train shape: (3240, 20, 1), Test shape: (1080, 20, 1) Test Accuracy: 81.48% Test Loss: 0.7513
In [50]:
# Side-by-side table of test accuracy / loss per feature type.
comparison_data = [
    {
        'Feature Type': feat_type.upper(),
        'Test Accuracy (%)': round(results[feat_type]['test_accuracy'] * 100, 2),
        'Test Loss': round(results[feat_type]['test_loss'], 4),
    }
    for feat_type in feature_types
]
comparison_df = pd.DataFrame(comparison_data)
print("\nPerformance Comparison:")
print(comparison_df.to_string(index=False))
Performance Comparison:
Feature Type Test Accuracy (%) Test Loss
MFCC 82.96 0.7929
PLP 79.54 0.8523
LPCC 68.70 1.3467
GFCC 81.48 0.7513
In [51]:
# Training vs. validation accuracy curves, one panel per feature type.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, feat_type in zip(axes.ravel(), feature_types):
    hist = results[feat_type]['history'].history
    ax.plot(hist['accuracy'], label='Train Accuracy', linewidth=2)
    ax.plot(hist['val_accuracy'], label='Validation Accuracy', linewidth=2)
    ax.set_title(f'{feat_type.upper()} - Accuracy', fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [52]:
# Training vs. validation loss curves, one panel per feature type.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, feat_type in zip(axes.ravel(), feature_types):
    hist = results[feat_type]['history'].history
    ax.plot(hist['loss'], label='Train Loss', linewidth=2)
    ax.plot(hist['val_loss'], label='Validation Loss', linewidth=2)
    ax.set_title(f'{feat_type.upper()} - Loss', fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [53]:
# Pick the feature set with the highest test accuracy and inspect its errors.
# (best_feat / best_feat_type / best_model are reused by later cells.)
best_feat = max(results.items(), key=lambda item: item[1]['test_accuracy'])
best_feat_type, best_info = best_feat
best_model = best_info['model']
print(f"Best Feature: {best_feat_type.upper()}")
print(f"Accuracy: {best_info['test_accuracy']*100:.2f}%\n")
# Predicted vs. true class indices on the held-out set
y_pred = best_model.predict(results[best_feat_type]['x_test'], verbose=0)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(results[best_feat_type]['y_test'], axis=1)
# Confusion matrix heatmap
cm = confusion_matrix(y_true_classes, y_pred_classes)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
plt.title(f'Confusion Matrix - {best_feat_type.upper()}', fontsize=14, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()
Best Feature: MFCC Accuracy: 82.96%
In [54]:
# Class names for the report. np.unique returns sorted labels, which matches
# OneHotEncoder's sorted-category order used during training, so the names
# line up with the argmax class indices computed above.
emotion_labels = np.unique(RAVD_df['labels'])
print(f"Classification Report for {best_feat_type.upper()}:\n")
print(classification_report(y_true_classes, y_pred_classes,
                            target_names=emotion_labels))
Classification Report for MFCC:
precision recall f1-score support
female_angry 0.93 0.79 0.85 84
female_disgust 0.85 0.79 0.82 71
female_fear 0.88 0.85 0.86 78
female_happy 0.78 0.87 0.82 67
female_neutral 0.78 0.96 0.86 96
female_sad 0.84 0.69 0.76 74
female_surprise 0.85 0.91 0.88 64
male_angry 0.85 0.93 0.89 74
male_disgust 0.91 0.63 0.74 78
male_fear 0.81 0.80 0.81 70
male_happy 0.82 0.77 0.79 75
male_neutral 0.78 0.94 0.85 109
male_sad 0.77 0.76 0.76 70
male_surprise 0.85 0.89 0.87 70
accuracy 0.83 1080
macro avg 0.84 0.83 0.83 1080
weighted avg 0.83 0.83 0.83 1080
In [55]:
# Bar chart comparing the final test accuracy of the four feature types.
plt.figure(figsize=(10, 6))
feature_names = [f.upper() for f in feature_types]
accuracy_pcts = [results[f]['test_accuracy'] * 100 for f in feature_types]
bars = plt.bar(feature_names, accuracy_pcts,
               color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
plt.xlabel('Feature Type', fontsize=12, fontweight='bold')
plt.ylabel('Test Accuracy (%)', fontsize=12, fontweight='bold')
plt.title('Model Performance Across Different Features', fontsize=14, fontweight='bold')
plt.ylim([0, 100])
# Annotate each bar with its exact percentage
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height,
             f'{height:.2f}%', ha='center', va='bottom', fontsize=10, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
In [56]:
# Save the best model
model_name = f'best_model_{best_feat_type}.keras'
save_dir = os.path.join(os.getcwd(), 'saved_models')
if not os.path.isdir(save_dir):
os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
best_model.save(model_path)
print(f'Saved best model ({best_feat_type.upper()}) at {model_path}')
Saved best model (MFCC) at /kaggle/working/saved_models/best_model_mfcc.keras
In [57]:
# Final text summary of all four feature pipelines.
separator = "=" * 60
print("\n" + separator)
print("SUMMARY OF RESULTS")
print(separator)
for feat_type in feature_types:
    entry = results[feat_type]
    print(f"{feat_type.upper():12s} - Accuracy: {entry['test_accuracy'] * 100:6.2f}% | Loss: {entry['test_loss']:.4f}")
print(separator)
print(f"Best Model: {best_feat_type.upper()} with {best_feat[1]['test_accuracy']*100:.2f}% accuracy")
print(separator)
============================================================ SUMMARY OF RESULTS ============================================================ MFCC - Accuracy: 82.96% | Loss: 0.7929 PLP - Accuracy: 79.54% | Loss: 0.8523 LPCC - Accuracy: 68.70% | Loss: 1.3467 GFCC - Accuracy: 81.48% | Loss: 0.7513 ============================================================ Best Model: MFCC with 82.96% accuracy ============================================================
In [ ]: